library(h2o)
library(tidyverse)
library(plotly)
# Not in container
library(recipes)
library(embed)
# Load the prepared data set from disk, then preview it.
data_prepared_tbl <- "00_data/data_prepared_tbl.rds" %>%
  read_rds()
data_prepared_tbl
## # A tibble: 4,998 x 64
## class Attr1 Attr2 Attr3 Attr4 Attr5 Attr6 Attr7 Attr8 Attr9 Attr10
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.0882 0.555 0.0113 1.02 -66.5 0.342 0.109 0.578 1.09 0.320
## 2 0 0.130 0.221 0.578 3.61 120. 0.188 0.162 3.06 1.14 0.677
## 3 0 0.0482 0.550 0.108 1.24 -23.0 0 0.0593 0.817 1.52 0.450
## 4 0 0.0995 0.600 0.375 1.65 19.0 0.211 0.124 0.667 1.10 0.400
## 5 0 0.0785 0.205 0.104 2.79 77.8 0.365 0.0934 3.87 1.23 0.795
## 6 0 0.125 0.354 0.314 2.71 17.9 0.306 0.158 1.82 1.24 0.646
## 7 0 0.185 0.340 0.383 2.13 54.4 0.630 0.231 1.84 1.15 0.626
## 8 0 0.0905 0.314 0.425 3.21 24.9 0.0557 0.105 2.15 1.05 0.676
## 9 0 -0.00213 0.251 0.351 2.48 31.9 0.124 0.00590 2.34 1.06 0.586
## 10 0 0.136 0.296 0.477 2.61 70.9 0.414 0.169 2.34 1.16 0.692
## # … with 4,988 more rows, and 53 more variables: Attr11 <dbl>, Attr12 <dbl>,
## # Attr13 <dbl>, Attr14 <dbl>, Attr15 <dbl>, Attr16 <dbl>, Attr17 <dbl>,
## # Attr18 <dbl>, Attr19 <dbl>, Attr20 <dbl>, Attr21 <dbl>, Attr22 <dbl>,
## # Attr23 <dbl>, Attr24 <dbl>, Attr25 <dbl>, Attr26 <dbl>, Attr27 <dbl>,
## # Attr28 <dbl>, Attr29 <dbl>, Attr30 <dbl>, Attr31 <dbl>, Attr32 <dbl>,
## # Attr33 <dbl>, Attr34 <dbl>, Attr35 <dbl>, Attr36 <dbl>, Attr38 <dbl>,
## # Attr39 <dbl>, Attr40 <dbl>, Attr41 <dbl>, Attr42 <dbl>, Attr43 <dbl>,
## # Attr44 <dbl>, Attr45 <dbl>, Attr46 <dbl>, Attr47 <dbl>, Attr48 <dbl>,
## # Attr49 <dbl>, Attr50 <dbl>, Attr51 <dbl>, Attr52 <dbl>, Attr53 <dbl>,
## # Attr54 <dbl>, Attr55 <dbl>, Attr56 <dbl>, Attr57 <dbl>, Attr58 <dbl>,
## # Attr59 <dbl>, Attr60 <dbl>, Attr61 <dbl>, Attr62 <dbl>, Attr63 <dbl>,
## # Attr64 <dbl>
# Read the raw attribute dictionary (one text column per attribute).
data_dictionary_raw_tbl <- read_rds("00_data/data_dictionary_raw_tbl.rds")

# Split each entry at its first space into an id and a description; with two
# target columns and extra = "merge", everything after that space stays in
# `desc`.
data_dictionary_tbl <- separate(
  data_dictionary_raw_tbl,
  `Attribute.Information:`,
  into  = c("id", "desc"),
  sep   = " ",
  extra = "merge"
)

# Replace the first "X" in each id with "Attr" (ids print as Attr1, Attr2, ...).
data_dictionary_tbl <- mutate(
  data_dictionary_tbl,
  id = str_replace(id, "X", "Attr")
)

data_dictionary_tbl
## # A tibble: 64 x 2
## id desc
## <chr> <chr>
## 1 Attr1 net profit / total assets
## 2 Attr2 total liabilities / total assets
## 3 Attr3 working capital / total assets
## 4 Attr4 current assets / short-term liabilities
## 5 Attr5 [(cash + short-term securities + receivables - short-term liabilities…
## 6 Attr6 retained earnings / total assets
## 7 Attr7 EBIT / total assets
## 8 Attr8 book value of equity / total liabilities
## 9 Attr9 sales / total assets
## 10 Attr10 equity / total assets
## # … with 54 more rows
# Initialize the connection to the H2O cluster with default settings.
h2o::h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 17 hours 15 minutes
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.30.0.1
## H2O cluster version age: 4 months and 1 day !!!
## H2O cluster name: H2O_started_from_R_mdancho_hfm505
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.67 GB
## H2O cluster total cores: 12
## H2O cluster allowed cores: 12
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.2 (2020-06-22)
# Build the saved-model path relative to the RStudio project root.
path <- rprojroot::find_rstudio_root_file() %>%
  file.path("00_production_model/PROD_H2O_MODEL")

# Load the stored H2O model from that path.
h2o_model <- h2o.loadModel(path)

# Upload the prepared data to H2O, score it with the loaded model, and bring
# the predictions back into R as a tibble.
predictions_tbl <- as_tibble(
  h2o.predict(h2o_model, newdata = as.h2o(data_prepared_tbl))
)
##
## |
## | | 0%
## |
## |======================================================================| 100%
##
## |
## | | 0%
## |
## |======================================================================| 100%
# Preview the predictions: predicted class plus the p0 / p1 probability columns.
predictions_tbl
## # A tibble: 4,998 x 3
## predict p0 p1
## <fct> <dbl> <dbl>
## 1 0 0.994 0.00584
## 2 0 0.998 0.00155
## 3 0 0.988 0.0119
## 4 0 0.994 0.00636
## 5 0 0.997 0.00292
## 6 0 0.997 0.00306
## 7 0 0.998 0.00215
## 8 0 0.998 0.00244
## 9 0 0.998 0.00237
## 10 0 0.997 0.00296
## # … with 4,988 more rows
# Preprocessing recipe with `class` as the outcome and all other columns as
# predictors, built up one step at a time.
recipe_spec <- recipe(class ~ ., data = data_prepared_tbl)

# Normalize every column whose name contains "Attr".
recipe_spec <- step_normalize(recipe_spec, contains("Attr"))

# Project the Attr columns onto 3 UMAP components, supplying `class` as the
# outcome and fixed seeds so the embedding is repeatable.
recipe_spec <- step_umap(
  recipe_spec,
  contains("Attr"),
  outcome  = vars(class),
  num_comp = 3,
  seed     = c(123, 123)
)

# Estimate the recipe and extract the processed training data.
umap_data_tbl <- juice(prep(recipe_spec))
umap_data_tbl
## # A tibble: 4,998 x 4
## class umap_1 umap_2 umap_3
## <fct> <dbl> <dbl> <dbl>
## 1 0 -1.59 2.67 0.828
## 2 0 1.18 0.856 1.25
## 3 0 -1.08 2.74 0.0844
## 4 0 -1.22 0.332 1.39
## 5 0 -0.369 2.18 0.802
## 6 0 0.198 1.89 1.08
## 7 0 0.765 1.36 1.37
## 8 0 0.633 0.486 2.25
## 9 0 -0.0246 0.287 1.43
## 10 0 0.558 1.27 1.15
## # … with 4,988 more rows
# Assemble plot data and hover text ----
# Column-bind (by row position) the UMAP coordinates, a row id plus four
# selected attributes from the prepared data, and the model predictions.
plot_data_tbl <- bind_cols(
  umap_data_tbl,
  select(
    rowid_to_column(data_prepared_tbl, var = ".id"),
    .id, Attr39, Attr56, Attr26, Attr22
  ),
  predictions_tbl
)

# Add a per-row tooltip string interpolated from the columns above. The glue
# template is kept flush-left so its text is exactly as originally written.
plot_data_tbl <- mutate(
  plot_data_tbl,
  tooltip = str_glue(
"
Company ID: {.id}
Class: {class}
Bankruptcy Probability: {scales::percent(p1, accuracy = 0.1)}
Attr 39 Profit on Sales / Sales: {Attr39}
Attr 26 (net profit + depreciation) / total liabilities: {Attr26}
Attr 22 profit on operating activities / total assets: {Attr22}
Attr56 (sales - cost of products sold) / sales: {Attr56}
"
  )
)
# Plotly Visualization ----
# Interactive 3-D scatter of the three UMAP components, colored by class.
#
# plotly lays out hover text as pseudo-HTML: a raw newline is displayed as a
# space and only a <br> tag starts a new line. The multi-line `tooltip` built
# with str_glue() therefore has its newlines converted to <br> here; without
# this the whole tooltip is shown on a single line.
plot_data_tbl %>%
  plot_ly(
    x = ~ umap_1, y = ~ umap_2, z = ~ umap_3,
    color = ~ class, colors = c("#BF382A", "#0C4B8E"),
    hovertemplate = ~ str_replace_all(tooltip, fixed("\n"), "<br>")
  ) %>%
  add_markers(opacity = 0.5)